/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.ArrayList;
import java.util.List;

import mosdi.discovery.EvaluatedPattern;
import mosdi.discovery.MotifFinder;
import mosdi.discovery.ObjectiveFunction;
import mosdi.discovery.objectives.OccurrenceCountObjective;
import mosdi.discovery.strategies.BruteForceSearch;
import mosdi.discovery.strategies.OptimumSearch;
import mosdi.discovery.strategies.ThresholdSearch;
import mosdi.fa.Alphabet;
import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.IIDTextModel;
import mosdi.fa.MarkovianTextModel;
import mosdi.index.SuffixTree;
import mosdi.index.SuffixTreeWalker;
import mosdi.util.BitArray;
import mosdi.util.Iupac;
import mosdi.util.IupacStringConstraints;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

public class SimpleDiscoverySubcommand extends Subcommand {
	
	@Override
	public String usage() {
		return
		super.usage()+" [options] <pattern-length> <fasta-file>\n" +
		"\n" +
		"General options\n" +
		"  -i: replace unknown characters in DNA sequences by $\n" +
		"\n" +
		"Controlling search STRATEGY:\n" +
		"  -T <pvalue-threshold>: Search for all motifs <= threshold (default: optimal motif only)\n" +
		"  -B: Perform brute force search: evaluate all motifs\n" +
		"\n" +
		"Controlling MOTIF SPACE to be searched:\n" +
		"  -m: minimum number of letters from {A,C,G,T},{W,S,R,Y,K,M},{B,D,H,V},{N}, default: \"0,0,0,0\"\n" +
		"  -M: maximum number of letters from {A,C,G,T},{W,S,R,Y,K,M},{B,D,H,V},{N}, default: \"<length>,0,0,<length>/2\"\n" +
		"  -l <lower-bound>: only consider patterns s.t. lower-bound<=pattern\n" +
		"  -u <upper-bound>: only consider patterns s.t. pattern<upper-bound\n" +
		"  -E <max-expectation>: upper bound for expectation (motifs with higher expectation\n" +
		"                       are not returned.)\n"+ 
		"  -r: consider reverse complement\n" +
		"\n" +
		"Other options:\n" +
//		"  -c <exp-clump-size-bound>: do not calculate upper bound for expected clump size dymanically,\n" +
//		"                             but use given threshold instead.\n" +
		"  -C <good-enough-bound>: when clump size bounds are computed dynamically, no new calculations\n" +
		"                          are made when a bound <=good_enough_bound is known (default: 1.01)\n"+
		"  -O <model-order>: use background model of given order (default: 0)";
	}
	
	@Override
	public String description() {
		return "Runs motif discovery on (constrained) IUPAC patterns."; 
	}

	@Override
	public String name() {
		return "discovery";
	}

	@Override
	public int run(String[] args) {
		parseOptions(args, 2, "iT:Bm:M:l:u:E:rC:O:");

		// Option dependencies
		exclusiveOptions("B", "T");

		// Mandatory arguments
		int patternLength = getIntArgument(0);
		String filename = getStringArgument(1);

		// Options
		final Alphabet iupacAlphabet = Alphabet.getIupacAlphabet();
		final Alphabet dnaAlphabet = Alphabet.getDnaAlphabet();
		boolean considerReverse = getBooleanOption("r", false);
		int[] minFrequencies = {0,0,0,0};
		minFrequencies = getRangedIntTupleOption("m", 4, 0, patternLength, minFrequencies);
		int[] maxFrequencies = {patternLength,0,0,patternLength/2};
		maxFrequencies = getRangedIntTupleOption("M", 4, 0, patternLength, maxFrequencies);
		double pValueThreshold = getRangedDoubleOption("T", 0.0, 1.0, -1.0);
		boolean pValueThresholdAndMode = getBooleanOption("a", false);
		boolean bruteForce = getBooleanOption("B", false);
		int modelOrder = getNonNegativeIntOption("O", 0);
		int[] lowerBound = getStringOption("l", iupacAlphabet, null);
		int[] upperBound = getStringOption("u", iupacAlphabet, null);
		double maxExpectation = getRangedDoubleOption("E", 0.0, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
		double goodEnoughClumpSizeBound = getRangedDoubleOption("C", 1.0, Double.POSITIVE_INFINITY, 1.01);
		boolean replaceUnknownByMinusOne = getBooleanOption("i", false);
		
		List<NamedSequence> namedSequences = null;
		Log.startTimer();
		try{
			namedSequences = SequenceUtils.readFastaFile(filename, dnaAlphabet, replaceUnknownByMinusOne);
		} catch (Exception e) {
			Log.errorln(e.toString());
			System.exit(1);
		}
		Log.stopTimer("Reading FASTA file");
		int effectiveLength = 0;
		for (NamedSequence ns : namedSequences) effectiveLength+=ns.length()-patternLength+1;
		Log.startTimer();
		List<int[]> sequences = new ArrayList<int[]>(namedSequences.size());
		for (NamedSequence ns : namedSequences) sequences.add(ns.getSequence());
		FiniteMemoryTextModel textModel = (modelOrder==0)?new IIDTextModel(dnaAlphabet.size(), sequences):new MarkovianTextModel(modelOrder,dnaAlphabet.size(),sequences);
		Log.stopTimer("Estimating text model");
		
		SuffixTree suffixTree = new SuffixTree();
		// allow sequences to be garbage collected
		sequences = null;
		int[] concatSequence = SequenceUtils.concatSequences(namedSequences, considerReverse);
		namedSequences = null;
		suffixTree.buildTree(concatSequence, dnaAlphabet.size(), patternLength);
		concatSequence = null;
		if (Log.levelAtLeast(Log.Level.DEBUG)) {
			SuffixTreeWalker walker = suffixTree.getWalker();
			BitArray wildcard = new BitArray(dnaAlphabet.size());
			wildcard.invert();
			for (int i=1; i<=patternLength; ++i) {
				int[] nodes = walker.forward(wildcard);
				Log.printf(Log.Level.DEBUG, "Nodes for prefix length %d: %,d (memory: %,d bytes)\n",i,nodes.length,nodes.length*suffixTree.getNodeSize());
			}
		}
		int[] occurrenceCountAnnotation = suffixTree.calcOccurrenceCountAnnotation();
		
		Log.startTimer();
//		FactorialCalculator fc = new FactorialCalculator(10000);
		BitArray[] generalizedAlphabet = Iupac.toGeneralizedString("ABCDGHKMNRSTVWY").getPositions();
		MotifFinder motifFinder = new MotifFinder(suffixTree, generalizedAlphabet, considerReverse);
		
		// IidPValueSearch search = new IidPValueSearch(sequences, textModel.getCharacterDistribution(), pValueThreshold, occurrenceCountAnnotation, false, expectedClumpSizeBound);
		int[] maxMatchCountAnnotation = suffixTree.calcMaxMatchCountAnnotation(patternLength);
		
		List<ObjectiveFunction> objectives = new ArrayList<ObjectiveFunction>();
		OccurrenceCountObjective o = new OccurrenceCountObjective(textModel, occurrenceCountAnnotation, maxMatchCountAnnotation, effectiveLength);
		o.setGoodEnoughClumpSizeBound(goodEnoughClumpSizeBound);
		if (!Double.isInfinite(maxExpectation)) o.setMaxExpectation(maxExpectation);
		objectives.add(o);
		
		MotifFinder.SearchSpecification search = null;
		if (bruteForce) {
			if (Log.levelAtLeast(Log.Level.DEBUG)) {
				search = new BruteForceSearch(objectives, iupacAlphabet, System.out);
			} else {
				search = new BruteForceSearch(objectives, iupacAlphabet, null);
			}
		} else if (pValueThreshold!=-1.0) {
			search = new ThresholdSearch(objectives,  pValueThreshold, System.out, pValueThresholdAndMode);
		} else {
			search = new OptimumSearch(objectives);
		}
		Log.startTimer();
		motifFinder.findIupacPatterns(patternLength, new IupacStringConstraints(minFrequencies,maxFrequencies), search, lowerBound, upperBound);
		Log.stopTimer("Search for IUPAC pattern on suffix tree");
		double timeMotifDiscovery = Log.getLastPeriodCpu();
		Log.print(Log.Level.VERBOSE, "------------------- RESULTS -------------------\n");
		Log.println(Log.Level.STANDARD, "Format: >> pattern >objective> score minus-log-pvalue pvalue");
		for (EvaluatedPattern m : search.getResults()) {
			// StringBuilder sb = new StringBuilder();
			// sb.append(String.format(">>p_value>> %e LIN ", m.getPValue()));
			// sb.append(String.format(">>stats>> %s %d ", (m.getPattern()==null)?"null":iupacAlphabet.buildString(m.getPattern()), m.getScore()));
			Log.println(Log.Level.STANDARD, m.toString(iupacAlphabet, objectives));
		}
		//Log.printf(Log.Level.STANDARD, ">>> %s %d %d %t %e%n", search.getMotifsEvaluated(), (long)Math.round(Math.exp(logNumber)), timeInstanceSearch, bestPValue);
		Log.printf(Log.Level.VERBOSE, ">>!>total_time>>: %t %n", timeMotifDiscovery);
		return 0;
	}
}
